Main question to address: What areas of NYC are under-serviced by ADA accessible stations?

Plan: Somehow randomly “sample” locations in the NYC territory in longitude, latitude units and then calculate distance from that point to the nearest accessible subway station. Normalize to nearest (accessible or not accessible) station?

Need: - data for which stations are ada accessible - data for station locations (longitude, latitude) - some way to sample the territory of NYC (not including Staten Island and all the rivers)

library(tidyverse)
library(ggmap)
library(stringr)
library(cowplot)
library(viridis)
library(geosphere)

Not sure which are the most reliable datasets to use here.

ada.raw.data <- read_csv(file = "../data/Elevator Escalator Station Data.csv") %>% 
  arrange(station)
sub.location.raw.data <- read_csv(file = "../data/NYC_Transit_Subway_Entrance_And_Exit_Data.csv") %>% 
  arrange(`Station Name`)
colnames(sub.location.raw.data)[3] <- "station"
colnames(sub.location.raw.data) <- colnames(sub.location.raw.data) %>% 
  str_to_lower() %>% 
  gsub(pattern = " ", replacement = ".")
colnames(ada.raw.data) <- colnames(ada.raw.data) %>% 
  str_to_lower()

Mucking around

# What are the cols and how many missing values per col?
sub.location.raw.data %>% 
  is.na %>% 
  as.data.frame() %>% 
  sapply(sum)
##           division               line            station 
##                  0                  0                  0 
##   station.latitude  station.longitude             route1 
##                  0                  0                  0 
##             route2             route3             route4 
##                848               1374               1547 
##             route5             route6             route7 
##               1630               1741               1788 
##             route8             route9            route10 
##               1820               1840               1845 
##            route11      entrance.type              entry 
##               1845                  0                  0 
##          exit.only            vending           staffing 
##               1812                  0                  0 
##        staff.hours                ada          ada.notes 
##               1828                  0               1793 
##     free.crossover north.south.street   east.west.street 
##                  0                 29                 35 
##             corner  entrance.latitude entrance.longitude 
##                 32                  0                  0 
##   station.location  entrance.location 
##                  0                  0
ada.raw.data %>% 
  is.na %>% 
  as.data.frame() %>% 
  sapply(sum)
##       station       borough       trainno   equipmentno equipmenttype 
##             0             0             0             0             0 
##       serving           ada 
##             0             0
full.join.mess <- ada.raw.data %>% 
  full_join(sub.location.raw.data, by = "station") %>% 
  arrange(station)
inner.join.mess <- ada.raw.data %>% 
  inner_join(sub.location.raw.data, by = "station") %>% 
  arrange(station)
full.join.mess %>% 
  filter(!(station %in% unique(inner.join.mess$station)))
## # A tibble: 2,046 x 38
##     station borough trainno equipmentno equipmenttype serving ada.x
##       <chr>   <chr>   <chr>       <chr>         <chr>   <chr> <chr>
##  1 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  2 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  3 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  4 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  5 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  6 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  7 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  8 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  9 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
## 10 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
## # ... with 2,036 more rows, and 31 more variables: division <chr>,
## #   line <chr>, station.latitude <dbl>, station.longitude <dbl>,
## #   route1 <chr>, route2 <chr>, route3 <chr>, route4 <chr>, route5 <chr>,
## #   route6 <chr>, route7 <chr>, route8 <int>, route9 <int>, route10 <int>,
## #   route11 <int>, entrance.type <chr>, entry <chr>, exit.only <chr>,
## #   vending <chr>, staffing <chr>, staff.hours <chr>, ada.y <lgl>,
## #   ada.notes <chr>, free.crossover <lgl>, north.south.street <chr>,
## #   east.west.street <chr>, corner <chr>, entrance.latitude <dbl>,
## #   entrance.longitude <dbl>, station.location <chr>,
## #   entrance.location <chr>
# need to work on this
rm(ada.raw.data, full.join.mess)

The Subway Entrance and Exit data seems to be good enough for an initial analysis. Hopefully the ADA column in that dataset is reliable. Need to compare this with the other datasets collected at some point, but joining is a problem. To do: - check against Wire Monkey’s joined dataset

For now, stick with the sub.location.raw.data for the rest of the analysis.

Map of all subway stations in this dataset:

# map of all the stations
nyc.map <- get_map(location = "New York City", maptype = "roadmap")
ggmap(nyc.map) +
  geom_point(data = sub.location.raw.data, aes(x = station.longitude, y = station.latitude)) +
  ylim(40.495992, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.257159, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data")
## Warning: Removed 1 rows containing missing values (geom_rect).

# Note: no stations on Staten Island - will exclude it in the following analysis

Notes: - Most stations have more than one row (multiple entrances/exits) - need to compress info - Station names in the station column are NOT unique (multiple 103rd St, etc) in the entrance/exit data, but station location may be a unique consistent handle to grab all the entrances/exits associated with a station - For now, will call any station that has at least one ADA-accessible entrance/exit “accessible”, but we know that some stations are only partially accessible. Different train lines are serviced by different platforms within the same stations, and not all are fully accessible. This is where the elevator/escalator data joining may be useful for future analysis.

head(sub.location.raw.data$ada)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
sub.loc.ada.sum <- sub.location.raw.data %>% 
  group_by(station.location) %>%  # seems to be the best unique key for stations in the dataset
  mutate(ada.sum = sum(ada)) %>%  # which stations are ada accessible? 
  select(division:entrance.type, ada, station.location, ada.sum) %>% 
  unique()
sub.loc.ada.sum %>% 
  ungroup() %>% 
  arrange(desc(ada.sum)) %>% 
  select(division:station, entrance.type:ada, ada.sum)
## # A tibble: 585 x 6
##    division             line                        station entrance.type
##       <chr>            <chr>                          <chr>         <chr>
##  1      IND         6 Avenue 47-50th Sts Rockefeller Center      Easement
##  2      IND         6 Avenue 47-50th Sts Rockefeller Center      Elevator
##  3      IND         6 Avenue 47-50th Sts Rockefeller Center         Stair
##  4      IND         6 Avenue 47-50th Sts Rockefeller Center         Stair
##  5      IND         8 Avenue                        34th St      Elevator
##  6      IND         8 Avenue                        34th St         Stair
##  7      IND Queens Boulevard               Jamaica-179th St      Elevator
##  8      IND Queens Boulevard               Jamaica-179th St         Stair
##  9      IND         8 Avenue                        59th St      Easement
## 10      IND         8 Avenue                        59th St      Elevator
## # ... with 575 more rows, and 2 more variables: ada <lgl>, ada.sum <int>
sub.loc.ada.sum %>% 
  ungroup() %>% 
  arrange(desc(station)) %>% 
  select(division:station, entrance.type:ada, ada.sum)
## # A tibble: 585 x 6
##    division      line                 station entrance.type   ada ada.sum
##       <chr>     <chr>                   <chr>         <chr> <lgl>   <int>
##  1      IRT    Pelham               Zerega Av         Stair FALSE       0
##  2      IND  6 Avenue                 York St         Stair FALSE       0
##  3      IND Concourse Yankee Stadium-161st St      Elevator  TRUE       8
##  4      IND Concourse Yankee Stadium-161st St         Stair  TRUE       8
##  5      IND Concourse Yankee Stadium-161st St          Door  TRUE       8
##  6      IRT    Jerome Yankee Stadium-161st St      Elevator  TRUE       8
##  7      IRT    Jerome Yankee Stadium-161st St         Stair  TRUE       8
##  8      IND  8 Avenue      World Trade Center         Stair  TRUE       8
##  9      IRT  Flushing     Woodside Av-61st St     Escalator  TRUE       3
## 10      IRT  Flushing     Woodside Av-61st St         Stair  TRUE       3
## # ... with 575 more rows

Looking over the result, stations with only “Stair” entrances are not ada-accessible (ada.sum == 0), but stations with “Elevator” and “Escalator” entrances have more than one ada-accessible entrance (ada.sum > 0). Will call stations with ada.sum > 0 ADA-Accessible. The total value of the ada.sum probably doesn’t matter much on it’s own. Larger stations with more entrances will have a higher ada.sum score just because of the number of entrances.

sub.loc.ada.unique <- sub.loc.ada.sum %>% 
  select(division:route11, station.location:ada.sum) %>% 
  unique() %>% 
  ungroup() %>% 
  arrange(ada.sum, station, line) 
# Probably don't need to keep dragging the route information around, but who knows

sub.loc.ada.annot <- sub.loc.ada.unique %>% 
  filter(ada.sum == 0) %>% 
  mutate(ada.class = "not.ada") %>% 
  bind_rows(
    sub.loc.ada.unique %>% 
      filter(ada.sum > 0) %>% 
      mutate(ada.class = "ada.complnt")
    ) %>% 
  arrange(station)

sub.loc.ada.annot %>% 
  ggplot(aes(x = station.longitude, y = station.latitude, color = ada.class)) +
  geom_point(size = 3, alpha = 0.5) +
  coord_quickmap()

ggmap(nyc.map) +
  geom_point(
    data = sub.loc.ada.annot, 
    aes(x = station.longitude, y = station.latitude, color = ada.class),
    size = 3) +
  ylim(40.55, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.1, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data\nColor by Accessibility")
## Warning: Removed 1 rows containing missing values (geom_rect).

Now the next questions is how do I “sample” points from the area of NYC?

The NYC map shapefile might be useful?

Borough boundary shapefile from: http://www1.nyc.gov/site/planning/data-maps/open-data/districts-download-metadata.page

NYC city limits? (same source as above)

West Longitude: -74.257159 East Longitude: -73.699215 North Latitude: 40.915568 South Latitude: 40.495992

library(rgdal)
nyc.shapefile <- readOGR(dsn = "../data/nybb_17c/nybb.shp")
nyc.map.df <- fortify(nyc.shapefile)
nyc.shapefile.map <- ggplot() +
  geom_path(
    data = nyc.map.df, aes(x = long, y = lat, group = group), size = 0.2
    ) 

The latitude and longitude look like they’re in different units from what I’m used to. Not sure how to make use of this.

I’ve worked a little bit with the NYC 311 call dataset before. Source: https://nycopendata.socrata.com/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9

Remembered that it has latitude/longitude location info for the calls. Grabbed the locations from the 2015 NYC 311 calls data - use as random sample?

nyc.311.loc <- read_csv(file = "../data/nyc_311_2015_locations.csv") %>% 
  filter(Borough == "BROOKLYN" | Borough == "BRONX" | Borough == "MANHATTAN" | Borough == "QUEENS") %>% 
  unique() %>%  # a lot of calls come from an identical lat/long
  filter(Longitude > -78.0)  # there's one odd point out
# too many points to just plot, but here's a density plot:

# locations
nyc.311.loc %>% 
  ggplot(aes(x = Longitude, y = Latitude)) +
  geom_bin2d(bins = 300) +
  scale_fill_viridis(option = "B") +
  coord_quickmap()

# can do a geom_point map with all points, but it takes a long time
# uncomment if you want to see:
#nyc.311.loc %>% 
#  ggplot(aes(x = Longitude, y = Latitude)) +
#  geom_point(alpha = 0.2, size = 0.25) +
#  coord_quickmap()

Not bad. Empty areas are parks probably, should be okay to not include those. Let’s see what happens.

Mini-test on a small portion of the subway stations and the 311 locations:

sub.loc.ada.annot <- sub.loc.ada.annot %>% 
  mutate(station.id = paste(station, division, line, sep = " / ")) 
sub.loc.ada.annot %>% 
  group_by(station.id) %>% 
  count() %>% 
  filter(n > 1)
## # A tibble: 5 x 2
## # Groups:   station.id [5]
##                                        station.id     n
##                                             <chr> <int>
## 1 47-50th Sts Rockefeller Center / IND / 6 Avenue     2
## 2     Brooklyn Bridge-City Hall / IRT / Lexington     8
## 3    Canarsie - Rockaway Parkway / BMT / Canarsie     3
## 4                      Chambers St / BMT / Nassau     2
## 5   Forest Hills-71st Av / IND / Queens Boulevard     2
# there's some dublicate stations still, must have had several station locations associated with those
sub.loc.ada.annot <- sub.loc.ada.annot %>% 
  group_by(station.id) %>% 
  mutate(avg.lat = mean(station.latitude)) %>% 
  mutate(avg.long = mean(station.longitude)) %>% 
  select(station.id, division:station, avg.lat:avg.long, ada.class) %>% 
  ungroup() %>% 
  unique()
nrow(sub.loc.ada.annot)
## [1] 465
length(sub.loc.ada.annot$station.id)
## [1] 465

There’s still some duplicates in the stations that have slightly different route assignments, but similar location. I’m going to leave them in for now, not sure what to do with them.

# small scale test
# pick 10 locations in NYC and find the distance from that point to 5 stations in the city
loc.test <- nyc.311.loc %>% 
  slice(1:10)
t1 <- rbind(sub.loc.ada.annot$station.id, sub.loc.ada.annot$station.id)
t2 <- as.data.frame(rbind(t1, t1, t1, t1, t1), stringsAsFactors = FALSE)
colnames(t2) <- paste("stat", 1:ncol(t2), sep = "")
t3 <- t2 %>% select(1:5)
t3
##                        stat1                             stat2
## 1  103rd St / IND / 8 Avenue 103rd St / IRT / Broadway-7th Ave
## 2  103rd St / IND / 8 Avenue 103rd St / IRT / Broadway-7th Ave
## 3  103rd St / IND / 8 Avenue 103rd St / IRT / Broadway-7th Ave
## 4  103rd St / IND / 8 Avenue 103rd St / IRT / Broadway-7th Ave
## 5  103rd St / IND / 8 Avenue 103rd St / IRT / Broadway-7th Ave
## 6  103rd St / IND / 8 Avenue 103rd St / IRT / Broadway-7th Ave
## 7  103rd St / IND / 8 Avenue 103rd St / IRT / Broadway-7th Ave
## 8  103rd St / IND / 8 Avenue 103rd St / IRT / Broadway-7th Ave
## 9  103rd St / IND / 8 Avenue 103rd St / IRT / Broadway-7th Ave
## 10 103rd St / IND / 8 Avenue 103rd St / IRT / Broadway-7th Ave
##                        stat3                      stat4
## 1  103rd St / IRT / Flushing 103rd St / IRT / Lexington
## 2  103rd St / IRT / Flushing 103rd St / IRT / Lexington
## 3  103rd St / IRT / Flushing 103rd St / IRT / Lexington
## 4  103rd St / IRT / Flushing 103rd St / IRT / Lexington
## 5  103rd St / IRT / Flushing 103rd St / IRT / Lexington
## 6  103rd St / IRT / Flushing 103rd St / IRT / Lexington
## 7  103rd St / IRT / Flushing 103rd St / IRT / Lexington
## 8  103rd St / IRT / Flushing 103rd St / IRT / Lexington
## 9  103rd St / IRT / Flushing 103rd St / IRT / Lexington
## 10 103rd St / IRT / Flushing 103rd St / IRT / Lexington
##                                         stat5
## 1  104th St-102nd St / BMT / Broadway Jamaica
## 2  104th St-102nd St / BMT / Broadway Jamaica
## 3  104th St-102nd St / BMT / Broadway Jamaica
## 4  104th St-102nd St / BMT / Broadway Jamaica
## 5  104th St-102nd St / BMT / Broadway Jamaica
## 6  104th St-102nd St / BMT / Broadway Jamaica
## 7  104th St-102nd St / BMT / Broadway Jamaica
## 8  104th St-102nd St / BMT / Broadway Jamaica
## 9  104th St-102nd St / BMT / Broadway Jamaica
## 10 104th St-102nd St / BMT / Broadway Jamaica
dist.test <- bind_cols(loc.test, t3) %>%
  gather(key = "station.num", value = "station.id", 4:8) %>% 
  arrange(station.num)
# each of the 10 (long,lat) points is now paired with each of the 5 stations selected
dist.test
## # A tibble: 50 x 5
##      Borough Latitude Longitude station.num                station.id
##        <chr>    <dbl>     <dbl>       <chr>                     <chr>
##  1  BROOKLYN 40.67033 -73.90643       stat1 103rd St / IND / 8 Avenue
##  2  BROOKLYN 40.68917 -73.93788       stat1 103rd St / IND / 8 Avenue
##  3     BRONX 40.86760 -73.89588       stat1 103rd St / IND / 8 Avenue
##  4  BROOKLYN 40.61637 -74.03827       stat1 103rd St / IND / 8 Avenue
##  5 MANHATTAN 40.77141 -73.95263       stat1 103rd St / IND / 8 Avenue
##  6     BRONX 40.84643 -73.90940       stat1 103rd St / IND / 8 Avenue
##  7     BRONX 40.82724 -73.87406       stat1 103rd St / IND / 8 Avenue
##  8     BRONX 40.82610 -73.92577       stat1 103rd St / IND / 8 Avenue
##  9    QUEENS 40.75865 -73.80800       stat1 103rd St / IND / 8 Avenue
## 10    QUEENS 40.66638 -73.84769       stat1 103rd St / IND / 8 Avenue
## # ... with 40 more rows
# link the station.id to a (long,lat) pair for that station: 
dist.test.w.stat.loc <- dist.test %>% 
  inner_join(sub.loc.ada.annot, by = "station.id")
# now calculate distance between point and the station using the geosphere package:
dist.test.res <- dist.test.w.stat.loc  %>% 
  mutate(
    dist.to.stat = distGeo(
      # p1 = NYC 311 location
      p1 = as.matrix(dist.test.w.stat.loc %>% select(Longitude, Latitude)), 
      # p2 = subway station location
      p2 = as.matrix(dist.test.w.stat.loc  %>% select(avg.long, avg.lat))
      ) * 0.000621371  # convert meters to miles
    ) %>% 
  mutate(loc.id = paste(Borough, Latitude, Longitude, sep = " / "))

What’s the distance from the NYC 311 point to the closest station (out of the 5):

dist.test.nearest <- dist.test.res %>% 
  group_by(loc.id) %>% 
  summarize(nearest.stat = min(dist.to.stat)) %>% 
  ungroup() %>% 
  inner_join((dist.test.res %>% select(Longitude, Latitude, loc.id)), by = "loc.id") %>% 
  unique()

ggmap(nyc.map) +
  geom_point(data = dist.test.nearest, aes(x = Longitude, y = Latitude, color = nearest.stat), size = 3) +
  scale_color_viridis(option = "D")  +
  ylim(40.55, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.1, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("Direct distance to nearest subway station from point in NYC")
## Warning: Removed 1 rows containing missing values (geom_rect).

Test run with a small fraction of the data seems to have worked, now to tackle the full data.

Run this chunk at your own risk. R took up >40GB of RAM at some points for me.

I exported the distance to any nearest subway station to any point and the distance to the nearest ada subway station into csv files. They’re imported in the next chunk.

nrow(nyc.311.loc)
# need to duplicate the station.id row that many times
stat.df <- as.data.frame(t1, stringsAsFactors = FALSE)
colnames(stat.df) <- paste("stat", 1:ncol(stat.df), sep = "")
# full df with all stations with enough rows for the nyc.311.loc df
stat.df.full <- stat.df[rep(seq_len(nrow(stat.df)), (nrow(nyc.311.loc) / 2)), ]
dist.full <- bind_cols(nyc.311.loc, stat.df.full)
# need to use my work computer to get through this step, takes up too much RAM
dist.full.t2 <- dist.full %>%
  gather(key = "station.num", value = "station.id", 4:ncol(dist.full)) %>% 
  arrange(station.num)
dist.full.w.stat.loc <- dist.full.t2 %>% 
  inner_join(sub.loc.ada.annot, by = "station.id")
rm(dist.full, dist.full.t2, stat.df.full)  # remove some monster variables
gc()  # get some memory back
dist.full.res <- dist.full.w.stat.loc  %>% 
  mutate(
    dist.to.stat = distGeo(
      # p1 = NYC 311 location
      p1 = as.matrix(dist.full.w.stat.loc %>% select(Longitude, Latitude)), 
      # p2 = subway station location
      p2 = as.matrix(dist.full.w.stat.loc  %>% select(avg.long, avg.lat))
      ) * 0.000621371  # convert meters to miles
    ) %>% 
  mutate(loc.id = paste(Borough, Latitude, Longitude, sep = " / "))
# freeing up memory again
rm(dist.test.w.stat.loc)
gc()
dim(dist.full.res)
# dim: 192529530 x 13
dist.full.nearest <- dist.full.res %>% 
  group_by(loc.id) %>% 
  summarize(nearest.stat = min(dist.to.stat)) %>% 
  ungroup() %>% 
  inner_join((dist.full.res %>% select(Longitude, Latitude, loc.id)), by = "loc.id") %>% 
  unique()
# write_csv(dist.full.nearest, path = "dist_to_any_nearest_station_nyc311.csv")
dist.ada.nearest <- dist.full.res %>% 
  filter(ada.class == "ada.complnt") %>% 
  group_by(loc.id) %>% 
  summarize(nearest.stat = min(dist.to.stat)) %>% 
  ungroup() %>% 
  inner_join((dist.full.res %>% select(Longitude, Latitude, loc.id)), by = "loc.id") %>% 
  unique()
# write_csv(dist.ada.nearest, path = "dist_to_nearest_ada_station_nyc311.csv")
rm(dist.full.res)
gc()

Import of the resulting csv files (to avoid evaluating that chunk above again):

dist.full.nearest <- read_csv(file = "../data/dist_to_any_nearest_station_nyc311.csv")
dist.ada.nearest <- read_csv(file = "../data/dist_to_nearest_ada_station_nyc311.csv")

Plots of the results!

Distance to any type of station from a particular point in NYC:

dist.full.nearest %>% 
  ggplot(aes(x = Longitude, y = Latitude, color = nearest.stat)) +
  geom_point(size = 0.2) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey75')) +
  coord_quickmap() +
  ggtitle("Direct distance to any nearest subway station from point in NYC")

dist.full.nearest %>% 
  ggplot(aes(x = Longitude, y = Latitude, color = nearest.stat)) +
  geom_point(size = 0.2, alpha = 0.2) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey75')) +
  coord_quickmap() +
  ggtitle("Direct distance to any nearest subway station from point in NYC")

ggmap(nyc.map) +
  geom_point(data = dist.full.nearest, aes(x = Longitude, y = Latitude, color = nearest.stat), size = 0.2) +
  scale_color_viridis(option = "B")  +
  ylim(40.55, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.1, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("Direct distance to any nearest subway station from point in NYC")
## Warning: Removed 1 rows containing missing values (geom_rect).

Uh oh, there seems to be an overlap problem.

dist.ada.nearest %>% 
  ggplot(aes(x = Longitude, y = Latitude, color = nearest.stat)) +
  geom_point(size = 0.2) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey75')) +
  coord_quickmap() +
  ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC")

dist.ada.nearest %>% 
  ggplot(aes(x = Longitude, y = Latitude, color = nearest.stat)) +
  geom_point(size = 0.2, alpha = 0.2) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey75')) +
  coord_quickmap() +
  ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC")

ggmap(nyc.map) +
  geom_point(data = dist.ada.nearest, aes(x = Longitude, y = Latitude, color = nearest.stat), size = 0.2) +
  scale_color_viridis(option = "B")  +
  ylim(40.55, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.1, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC")
## Warning: Removed 1 rows containing missing values (geom_rect).

Unsurprisingly, some areas are just far from any subway stations and that carries over into what areas are far from ADA stations.

colnames(dist.full.nearest)[2:4] <- c("near.any.stat", "any.long", "any.lat")
colnames(dist.ada.nearest)[2:4] <- c("near.ada.stat", "ada.long", "ada.lat")
dist.combo <- dist.full.nearest %>% 
  full_join(dist.ada.nearest, by = "loc.id")
# sanity checks for joining:
sapply(dist.full.nearest, anyNA)
##        loc.id near.any.stat      any.long       any.lat 
##         FALSE         FALSE         FALSE         FALSE
sapply(dist.ada.nearest, anyNA)
##        loc.id near.ada.stat      ada.long       ada.lat 
##         FALSE         FALSE         FALSE         FALSE
sapply(dist.combo, anyNA)
##        loc.id near.any.stat      any.long       any.lat near.ada.stat 
##         FALSE         FALSE         FALSE         FALSE         FALSE 
##      ada.long       ada.lat 
##         FALSE         FALSE
all.equal(dist.combo$any.lat, dist.combo$ada.lat)
## [1] TRUE
all.equal(dist.combo$any.long, dist.combo$ada.long)
## [1] TRUE
# joining seems ok
dist.combo <- dist.combo %>% 
  select(loc.id, any.long, any.lat, near.any.stat, near.ada.stat) %>% 
  mutate(any.vs.ada.dist = near.ada.stat - near.any.stat) %>% 
  mutate(any.vs.ada.ratio = near.ada.stat / near.any.stat)

Distance viz

dist.combo %>% 
  ggplot(aes(x = any.vs.ada.dist)) +
  geom_histogram()

dist.combo %>% 
  ggplot(aes(x = any.vs.ada.ratio)) +
  geom_histogram()

# okay so some of the ratio values are really, really large - these are probably points right on top of stations that are not ADA-certified
dist.combo %>% 
  filter(any.vs.ada.ratio < 50) %>%
  ggplot(aes(x = any.vs.ada.ratio)) +
  geom_histogram()

dist.combo %>% 
  ggplot(aes(x = any.long, y = any.lat, color = any.vs.ada.dist)) +
  geom_point(size = 0.2, alpha = 0.2) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey75')) +
  coord_quickmap() +
  ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC")

dist.combo %>% 
  filter(any.vs.ada.ratio < 100) %>%  # this number was trial and error
  ggplot(aes(x = any.long, y = any.lat, color = any.vs.ada.ratio)) +
  geom_point(size = 0.2, alpha = 0.2) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey75')) +
  coord_quickmap() +
  ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC")

# I was right, looks like its points right on top of stations that are really "far" from ada-accessible stations

dist.combo %>% 
  filter(any.vs.ada.ratio > 5) %>%  # this number was trial and error
  mutate(any.vs.ada.ratio = 5) %>% 
  bind_rows(
    dist.combo %>% 
      filter(any.vs.ada.ratio < 5)
    ) %>% 
  ggplot(aes(x = any.long, y = any.lat, color = any.vs.ada.ratio)) +
  geom_point(size = 0.2, alpha = 0.2) +
  scale_color_viridis(option = "D") +
  theme(panel.background = element_rect(fill = 'grey75')) +
  coord_quickmap() +
  ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC")

To tackle - Separate distance calculations by borough for both the random point and the station, in case “closest” stations are across a river. Probably can keep Queens and Brooklyn together? -